{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys,os\n",
    "import csv\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import scipy\n",
    "import numpy as np\n",
    "import IPython.display\n",
    "import pickle\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Generation\n",
    "\n",
    "This notebook generates bag-of-words matrices and labels from the Manifesto datasets of several countries. Each dataset is a JSON file whose records are single sentences. Each record has the following fields:\n",
    "\n",
    "- party: the party the sentence belongs to\n",
    "- year: the year of the manifesto\n",
    "- orientation: can be Left-wing, Right-wing or Other\n",
    "- text: the text of the sentence\n",
    "- clean_text: the stemmed text, with stop words removed\n",
    "\n",
    "Parties in the __parties_to_exclude__ dictionary below are excluded from training.\n",
    "\n",
    "Parties in the __populist_parties__ dictionary are those that we will label as 1 in our final data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parties left out of the data altogether, keyed by country code.\n",
    "parties_to_exclude = {\n",
    "    'IT': [\n",
    "        'Forward Italy', 'PdL', 'Italy of Values', 'Casapound', 'Houses of Freedom',\n",
    "    ],\n",
    "    'FR': [\n",
    "        'The Greens', 'French Communist Party', 'Nouveau Parti Anticapitaliste',\n",
    "        'Resistons', 'Debout la France',\n",
    "    ],\n",
    "    'AT': ['Peter Pilz List'],\n",
    "    'NL': [\n",
    "        'DENK', 'Party for the Animals', 'Reformed Political Party', '50Plus', 'Green Left',\n",
    "    ],\n",
    "    'ES': [\n",
    "        'Amaiur', 'Andalusian Party', 'Aragonist Council', 'Basque Country Unite',\n",
    "        'Basque Nationalist Party', 'Basque Solidarity', 'Canarian Coalition',\n",
    "        'Catalan Republican Left', 'Citizens', 'Commitment-Q', 'Commitment-We can-It is time',\n",
    "        'Democratic Convergence of Catalonia', 'Forum Asturias', 'Future Yes',\n",
    "        'Galician Nationalist Bloc', 'In Tide', \"Navarrese People's Union\", 'Valencian style',\n",
    "    ],\n",
    "    'DE': ['Pirates'],\n",
    "}\n",
    "\n",
    "# Parties whose sentences receive label 1, keyed by country code.\n",
    "populist_parties = {\n",
    "    'IT': ['Northern League', 'PaP', 'M5S', 'Brothers of Italy'],\n",
    "    'FR': ['National Front', 'Indomitable France'],\n",
    "    'AT': [\n",
    "        'Austrian Freedom Party', 'Alliance for the Future of Austria',\n",
    "        'Team Stronach for Austria',\n",
    "    ],\n",
    "    'NL': ['Party of Freedom', 'List Pim Fortuyn', 'Socialist Party', 'Forum for Democracy'],\n",
    "    'ES': ['We can', 'In Common We Can', 'Vox'],\n",
    "    'DE': ['The Left', 'Alternative for Germany'],\n",
    "}\n",
    "\n",
    "# The country codes drive the per-nation loop; iteration follows the key order above.\n",
    "nations = populist_parties.keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Manifesto Datasets Bag of words and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nation = IT\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "nation = FR\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "nation = AT\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "nation = NL\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "nation = ES\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "nation = DE\n",
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n",
      "done\n"
     ]
    }
   ],
   "source": [
    "# Create the output folder up front so the dumps below also work on a fresh checkout.\n",
    "os.makedirs(\"./bow_and_labels\", exist_ok=True)\n",
    "\n",
    "for nation in nations:\n",
    "    # Context managers close every file handle deterministically.\n",
    "    with open(\"./datasets/{}_manifesto_sentences.json\".format(nation), \"r\") as dataset_file:\n",
    "        data = json.load(dataset_file)\n",
    "\n",
    "    print(\"nation = {}\".format(nation))\n",
    "\n",
    "    # Drop excluded parties once, so the counting pass and the matrix pass\n",
    "    # are guaranteed to see exactly the same sentences.\n",
    "    records = [r for r in data if r[\"party\"] not in parties_to_exclude[nation]]\n",
    "    N_sentences = len(records)\n",
    "\n",
    "    print(\"finding all words...\")\n",
    "    counts = {}\n",
    "    for record in records:\n",
    "        for word in record[\"clean_text\"]:\n",
    "            counts[word] = counts.get(word, 0) + 1\n",
    "\n",
    "    print(\"generating words indices...\")\n",
    "\n",
    "    # Keep only words seen more than 4 times and longer than 2 characters.\n",
    "    counts = {w: c for w, c in counts.items() if c > 4 and len(w) > 2}\n",
    "    # A word's column is its position among the surviving words (dict insertion order).\n",
    "    word_index = {w: k for k, w in enumerate(counts)}\n",
    "    N = len(word_index)\n",
    "\n",
    "    print(\"generating bag of words and labels...\")\n",
    "\n",
    "    X = np.zeros((N_sentences, N))  # X[i, j] = 1 when word j occurs in sentence i\n",
    "    Y = np.zeros(N_sentences)       # Y[i] = 1 when sentence i belongs to a populist party\n",
    "    parties = []\n",
    "    years = []\n",
    "    orientations = []\n",
    "    for i, record in enumerate(records):\n",
    "        for w in record[\"clean_text\"]:\n",
    "            j = word_index.get(w)\n",
    "            # Words filtered out of the vocabulary have no column and are skipped.\n",
    "            if j is not None:\n",
    "                X[i, j] = 1\n",
    "\n",
    "        if record[\"party\"] in populist_parties[nation]:\n",
    "            Y[i] = 1\n",
    "        parties.append(record[\"party\"])\n",
    "        years.append(record[\"year\"])\n",
    "        orientations.append(record[\"orientation\"])\n",
    "\n",
    "    parties = np.array(parties)\n",
    "    years = np.array(years)\n",
    "    orientations = np.array(orientations)\n",
    "\n",
    "    # One pickle per array; the key is the file-name prefix.\n",
    "    arrays = {'X': X, 'Y': Y, 'parties': parties, 'years': years, 'orientations': orientations}\n",
    "    for name, array in arrays.items():\n",
    "        with open(\"./bow_and_labels/{}_{}_sentences.pkl\".format(name, nation), \"wb\") as out_file:\n",
    "            pickle.dump(array, out_file)\n",
    "\n",
    "    print()\n",
    "\n",
    "print(\"done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Italian Speeches Bag of words and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Create the output folder up front so the dumps below also work on a fresh checkout.\n",
    "os.makedirs(\"./bow_and_labels\", exist_ok=True)\n",
    "\n",
    "# The context manager closes the dataset file as soon as it is parsed.\n",
    "with open(\"./datasets/IT_speeches_sentences.json\", \"r\") as dataset_file:\n",
    "    data = json.load(dataset_file)\n",
    "\n",
    "# Drop excluded parties once, so the counting pass and the matrix pass\n",
    "# are guaranteed to see exactly the same sentences.\n",
    "records = [r for r in data if r[\"party\"] not in parties_to_exclude[\"IT\"]]\n",
    "N_sentences = len(records)\n",
    "\n",
    "print(\"finding all words...\")\n",
    "counts = {}\n",
    "for record in records:\n",
    "    for word in record[\"clean_text\"]:\n",
    "        counts[word] = counts.get(word, 0) + 1\n",
    "\n",
    "print(\"generating words indices...\")\n",
    "\n",
    "# Keep only words seen more than 4 times and longer than 2 characters.\n",
    "counts = {w: c for w, c in counts.items() if c > 4 and len(w) > 2}\n",
    "# A word's column is its position among the surviving words (dict insertion order).\n",
    "word_index = {w: k for k, w in enumerate(counts)}\n",
    "N = len(word_index)\n",
    "\n",
    "print(\"generating bag of words and labels...\")\n",
    "\n",
    "X = np.zeros((N_sentences, N))  # X[i, j] = 1 when word j occurs in sentence i\n",
    "Y = np.zeros(N_sentences)       # Y[i] = 1 when sentence i belongs to a populist party\n",
    "parties = []\n",
    "years = []\n",
    "orientations = []\n",
    "for i, record in enumerate(records):\n",
    "    for w in record[\"clean_text\"]:\n",
    "        j = word_index.get(w)\n",
    "        # Words filtered out of the vocabulary have no column and are skipped.\n",
    "        if j is not None:\n",
    "            X[i, j] = 1\n",
    "\n",
    "    if record[\"party\"] in populist_parties[\"IT\"]:\n",
    "        Y[i] = 1\n",
    "    parties.append(record[\"party\"])\n",
    "    years.append(record[\"year\"])\n",
    "    orientations.append(record[\"orientation\"])\n",
    "\n",
    "parties = np.array(parties)\n",
    "years = np.array(years)\n",
    "orientations = np.array(orientations)\n",
    "\n",
    "# One pickle per array; the key is the file-name prefix.\n",
    "arrays = {'X': X, 'Y': Y, 'parties': parties, 'years': years, 'orientations': orientations}\n",
    "for name, array in arrays.items():\n",
    "    with open(\"./bow_and_labels/{}_IT_speeches_sentences.pkl\".format(name), \"wb\") as out_file:\n",
    "        pickle.dump(array, out_file)\n",
    "\n",
    "print()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Italian Manual Annotations Bag of words and Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finding all words...\n",
      "generating words indices...\n",
      "generating bag of words and labels...\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Create the output folder up front so the dumps below also work on a fresh checkout.\n",
    "os.makedirs(\"./bow_and_labels\", exist_ok=True)\n",
    "\n",
    "# The context manager closes the dataset file as soon as it is parsed.\n",
    "with open(\"./datasets/IT_manual_sentences.json\", \"r\") as dataset_file:\n",
    "    data = json.load(dataset_file)\n",
    "\n",
    "# No party is excluded here: every record of the dataset is used.\n",
    "N_sentences = len(data)\n",
    "\n",
    "print(\"finding all words...\")\n",
    "counts = {}\n",
    "for record in data:\n",
    "    for word in record[\"clean_text\"]:\n",
    "        counts[word] = counts.get(word, 0) + 1\n",
    "\n",
    "print(\"generating words indices...\")\n",
    "\n",
    "# Keep only words seen more than 4 times and longer than 2 characters.\n",
    "counts = {w: c for w, c in counts.items() if c > 4 and len(w) > 2}\n",
    "# A word's column is its position among the surviving words (dict insertion order).\n",
    "word_index = {w: k for k, w in enumerate(counts)}\n",
    "N = len(word_index)\n",
    "\n",
    "print(\"generating bag of words and labels...\")\n",
    "\n",
    "X = np.zeros((N_sentences, N))  # X[i, j] = 1 when word j occurs in sentence i\n",
    "Y = np.zeros(N_sentences)       # Y[i] = 1 when the record's is_populist field is truthy\n",
    "parties = []\n",
    "years = []\n",
    "orientations = []\n",
    "for i, record in enumerate(data):\n",
    "    for w in record[\"clean_text\"]:\n",
    "        j = word_index.get(w)\n",
    "        # Words filtered out of the vocabulary have no column and are skipped.\n",
    "        if j is not None:\n",
    "            X[i, j] = 1\n",
    "\n",
    "    # Labels come from the annotation stored in the record, not from the party.\n",
    "    if record[\"is_populist\"]:\n",
    "        Y[i] = 1\n",
    "    parties.append(record[\"party\"])\n",
    "    years.append(record[\"year\"])\n",
    "    orientations.append(record[\"orientation\"])\n",
    "\n",
    "parties = np.array(parties)\n",
    "years = np.array(years)\n",
    "orientations = np.array(orientations)\n",
    "\n",
    "# One pickle per array; the key is the file-name prefix. Pairing each name with\n",
    "# its array here ensures the orientations file receives the orientations\n",
    "# (the years array used to be written to it by mistake).\n",
    "arrays = {'X': X, 'Y': Y, 'parties': parties, 'years': years, 'orientations': orientations}\n",
    "for name, array in arrays.items():\n",
    "    with open(\"./bow_and_labels/{}_IT_manual_sentences.pkl\".format(name), \"wb\") as out_file:\n",
    "        pickle.dump(array, out_file)\n",
    "\n",
    "print()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
